data_path = "/Users/onurcanaydin/Desktop/360 proje/ProjectRawData.csv"
daily_data = fread(data_path)
str(daily_data)
## Classes 'data.table' and 'data.frame': 4331 obs. of 13 variables:
## $ event_date : IDate, format: "2021-05-31" "2021-05-31" ...
## $ product_content_id : int 85004 4066298 6676673 7061886 31515569 32737302 32939029 48740784 73318567 85004 ...
## $ price : num 87.4 65.8 120.7 294.4 60.3 ...
## $ sold_count : int 80 1398 345 24 454 44 125 2 175 85 ...
## $ visit_count : int 6002 19102 19578 1485 20114 5017 5301 221 20256 5265 ...
## $ basket_count : int 744 5096 1261 81 2871 316 608 15 965 633 ...
## $ favored_count : int 1642 1703 1510 122 2102 607 698 22 1960 1104 ...
## $ category_sold : int 5048 6547 4944 951 8155 5198 930 1684 5198 4117 ...
## $ category_visits : int 236197 108811 306462 95645 637143 1010634 41973 329087 1010634 215260 ...
## $ category_basket : int 33681 28558 23418 4704 49389 33728 3911 12614 33728 25181 ...
## $ category_favored : int 40472 11913 26597 8886 62460 96699 5791 24534 96699 36225 ...
## $ category_brand_sold: int 743 4286 786 179 1759 3665 875 12 3665 430 ...
## $ ty_visits : int 125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 125439876 131821083 ...
## - attr(*, ".internal.selfref")=<externalptr>
daily_data$event_date <- as.Date(daily_data$event_date, "%d.%m.%Y")
We divided ‘daily_data’ into different data tables in terms of products to examine and understand more easily in the future parts.
product1_data <- daily_data[product_content_id == 48740784]
product2_data <- daily_data[product_content_id == 73318567]
product3_data <- daily_data[product_content_id == 32737302]
product4_data <- daily_data[product_content_id == 31515569]
product5_data <- daily_data[product_content_id == 6676673]
product6_data <- daily_data[product_content_id == 7061886]
product7_data <- daily_data[product_content_id == 85004]
product8_data <- daily_data[product_content_id == 4066298]
product9_data <- daily_data[product_content_id == 32939029]
ggplot(product1_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 1 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product1_data$sold_count, lag.max = 365, main = "The ACF of the Product1")
There are high autocorrelation values for some recent lags.There is no obvious trend but there may be a daily seasonality.
ggplot(product2_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 2 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product2_data$sold_count, lag.max = 365, main = "The ACF of the Product2")
There are high and decreasing autocorrelation values for recent lags.Initially Decreasing autocorrelation values show us a trend.
ggplot(product3_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 3 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product3_data$sold_count, lag.max = 365, main = "The ACF of the Product3")
There are high and decreasing autocorrelation values for recent lags.Initially decreasing autocorrelation values show us a trend.While there is a downward trend between May 2020 and July 2020, there is a upward trend after March 2021.Since it is a product used in summer, sales start and increase before summer, and sales decrease towards the end of summer.
ggplot(product4_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 4 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product4_data$sold_count, lag.max = 365, main = "The ACF of the Product4")
When we look at the acf chart of product4, there is a trend, although it is not as obvious as the trend in product2 and 3.The decrease in the initial lags indicates the trend, but since this decrease is only for a few lags, we understand that the trend in sales values is short-term.Also it may have weekly and monthly seasonality.
ggplot(product5_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 5 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product5_data$sold_count, lag.max = 365, main = "The ACF of the Product5")
There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 5. There are significant autocorrelation around lag 30.There is a weekly seasonality, this is probably due to salary days or discount days that is specific days or periods(first week or last week of month) in month.
ggplot(product6_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 6 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product6_data$sold_count, lag.max = 365, main = "The ACF of the Product6")
There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 6.There are significant autocorrelation around lag 16.
ggplot(product7_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 7 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product7_data$sold_count, lag.max = 365, main = "The ACF of the Product7")
There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 7. There are significant autocorrelation around lag 15.
ggplot(product8_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 8 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product8_data$sold_count, lag.max = 365, main = "The ACF of the Product8")
There are high and decreasing autocorrelation values for recent lags ,so there is a trend in sales data of product 8.
ggplot(product9_data, aes(x = event_date, y = sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Product 9 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
acf(product9_data$sold_count, lag.max = 365, main = "The ACF of the Product9")
There are high and decreasing autocorrelation values for many recent lags ,so there is a trend in sales data of product 9.The decrease in the initial lags indicates the trend, but since this decrease is for many lags, we understand that the trend in sales values is long time period.
In general, the sales values of all products include trend or seasonality.Therefore, none of them are stationary.We need to decomposition to make them stationary.Firstly,we did decomposition at daily level.
product1_ts <- ts(product1_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product2_ts <- ts(product2_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product3_ts <- ts(product3_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product4_ts <- ts(product4_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product5_ts <- ts(product5_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product6_ts <- ts(product6_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product7_ts <- ts(product7_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product8_ts <- ts(product8_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product9_ts <- ts(product9_data$sold_count, start = as.Date("2020-05-25"),
end = as.Date("2021-05-31"), frequency = 7)
product1_dec_daily <- decompose(x = product1_ts, type = "additive")
product2_dec_daily <- decompose(x = product2_ts, type = "additive")
product3_dec_daily <- decompose(x = product3_ts, type = "additive")
product4_dec_daily <- decompose(x = product4_ts, type = "additive")
product5_dec_daily <- decompose(x = product5_ts, type = "additive")
product6_dec_daily <- decompose(x = product6_ts, type = "additive")
product7_dec_daily <- decompose(x = product7_ts, type = "additive")
product8_dec_daily <- decompose(x = product8_ts, type = "additive")
product9_dec_daily <- decompose(x = product9_ts, type = "additive")
ACF plot of random component of products that are decomposed at daily level:
acf(product1_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product1")
acf(product2_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product2")
acf(product3_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product3")
acf(product4_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product4")
acf(product5_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product5")
acf(product6_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product6")
acf(product7_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product7")
acf(product8_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product8")
acf(product9_dec_daily$random, na.action = na.pass, lag.max = 365,
main = "The ACF of the Random component for Product9")
There are still high autocorrelation values but they are generally purified from trend and seasonality.
summary(ur.kpss(product1_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0018
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product2_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0041
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product3_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0044
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product4_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0018
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product5_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0019
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product6_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0018
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product7_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0018
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product8_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0019
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product9_dec_daily$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 9 lags.
##
## Value of test-statistic is: 0.0031
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The unit root tests suggest that the random components are stationary.
After daily decomposition,we did decomposition at weekly level.While doing the weekly decomposition, we first found the weekly mean of the daily values.
weekly_product1_data = product1_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product2_data = product2_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product3_data = product3_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product4_data = product4_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product5_data = product5_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product6_data = product6_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product7_data = product7_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product8_data = product8_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product9_data = product9_data[, list(weekly_mean_sold_count = mean(sold_count,
na.rm = T)), by = list(week(event_date), year(event_date))]
weekly_product9_data
## week year weekly_mean_sold_count
## 1: 22 2021 144.750000
## 2: 21 2021 181.571429
## 3: 20 2021 317.714286
## 4: 19 2021 407.285714
## 5: 18 2021 288.000000
## 6: 17 2021 210.857143
## 7: 16 2021 190.142857
## 8: 15 2021 191.000000
## 9: 14 2021 277.142857
## 10: 13 2021 195.000000
## 11: 12 2021 81.000000
## 12: 11 2021 99.428571
## 13: 10 2021 196.428571
## 14: 9 2021 129.714286
## 15: 8 2021 135.714286
## 16: 7 2021 73.428571
## 17: 6 2021 79.285714
## 18: 5 2021 83.857143
## 19: 4 2021 26.571429
## 20: 3 2021 119.571429
## 21: 2 2021 151.857143
## 22: 1 2021 44.714286
## 23: 53 2020 14.500000
## 24: 52 2020 75.142857
## 25: 51 2020 73.285714
## 26: 50 2020 46.571429
## 27: 49 2020 99.714286
## 28: 48 2020 49.142857
## 29: 47 2020 11.571429
## 30: 46 2020 69.000000
## 31: 45 2020 85.142857
## 32: 44 2020 11.571429
## 33: 43 2020 13.142857
## 34: 42 2020 17.857143
## 35: 41 2020 73.142857
## 36: 40 2020 57.571429
## 37: 39 2020 55.285714
## 38: 38 2020 39.857143
## 39: 37 2020 66.571429
## 40: 36 2020 59.857143
## 41: 35 2020 78.285714
## 42: 34 2020 48.285714
## 43: 33 2020 36.571429
## 44: 32 2020 11.571429
## 45: 31 2020 11.571429
## 46: 30 2020 17.571429
## 47: 29 2020 15.142857
## 48: 28 2020 20.000000
## 49: 27 2020 4.000000
## 50: 26 2020 3.571429
## 51: 25 2020 14.571429
## 52: 24 2020 54.428571
## 53: 23 2020 31.285714
## 54: 22 2020 56.857143
## 55: 21 2020 87.500000
## week year weekly_mean_sold_count
Graphs of weekly mean sold counts of products:
ggplot(weekly_product1_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 1 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product2_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 2 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product3_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 3 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product4_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 4 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product5_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 5 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product6_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 6 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product7_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 7 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product8_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 8 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
ggplot(weekly_product9_data, aes(x = week, y = weekly_mean_sold_count)) +
geom_line(colour = "firebrick2", size = 1.5) + labs(title = "Weekly Mean of Product 9 Graph",
x = "Time", y = "Sales quantity") + theme_dark()
The aim is to purify the seasonality that is effect of factors such as price reductions and paydays that occur in the same periods of the months.
weekly_product1_ts <- ts(weekly_product1_data$weekly_mean_sold_count,
frequency = 4)
weekly_product2_ts <- ts(weekly_product2_data$weekly_mean_sold_count,
frequency = 4)
weekly_product3_ts <- ts(weekly_product3_data$weekly_mean_sold_count,
frequency = 4)
weekly_product4_ts <- ts(weekly_product4_data$weekly_mean_sold_count,
frequency = 4)
weekly_product5_ts <- ts(weekly_product5_data$weekly_mean_sold_count,
frequency = 4)
weekly_product6_ts <- ts(weekly_product6_data$weekly_mean_sold_count,
frequency = 4)
weekly_product7_ts <- ts(weekly_product7_data$weekly_mean_sold_count,
frequency = 4)
weekly_product8_ts <- ts(weekly_product8_data$weekly_mean_sold_count,
frequency = 4)
weekly_product9_ts <- ts(weekly_product9_data$weekly_mean_sold_count,
frequency = 4)
product1_dec_weekly <- decompose(x = weekly_product1_ts, type = "additive")
product2_dec_weekly <- decompose(x = weekly_product2_ts, type = "additive")
product3_dec_weekly <- decompose(x = weekly_product3_ts, type = "additive")
product4_dec_weekly <- decompose(x = weekly_product4_ts, type = "additive")
product5_dec_weekly <- decompose(x = weekly_product5_ts, type = "additive")
product6_dec_weekly <- decompose(x = weekly_product6_ts, type = "additive")
product7_dec_weekly <- decompose(x = weekly_product7_ts, type = "additive")
product8_dec_weekly <- decompose(x = weekly_product8_ts, type = "additive")
product9_dec_weekly <- decompose(x = weekly_product9_ts, type = "additive")
ACF plots of random components of products that are decomposed at weekly level:
acf(product1_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product1")
acf(product2_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product2")
acf(product3_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product3")
acf(product4_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product4")
acf(product5_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product5")
acf(product6_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product6")
acf(product7_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product7")
acf(product8_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product8")
acf(product9_dec_weekly$random, na.action = na.pass, lag.max = 52,
main = "The ACF of the Random component for Product9")
There are still high autocorrelation values but they are generally purified from trend and seasonality.
summary(ur.kpss(product1_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0429
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product2_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0916
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product3_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.086
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product4_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0362
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product5_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0738
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product6_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0366
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product7_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.042
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product8_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.0695
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
summary(ur.kpss(product9_dec_weekly$random))
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 3 lags.
##
## Value of test-statistic is: 0.1822
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
The unit root tests suggest that the random components are stationary.However, the random components in the daily decomposition had lower values in the unit root test than those in the weekly decomposition.Therefore, we can say that the random components of the daily decomposition are more stationary.
We also set up the monthly decomposition model, but since the sales data of the products are 1 year, when we make them monthly_ts, they consist of only one observation and cannot be decomposed.
we build arima models with random components of daily decomposition.
product1_arima_model = auto.arima(product1_dec_daily$random,
max.p = 2, max.q = 2)
product1_arima_model
## Series: product1_dec_daily$random
## ARIMA(0,0,1)(0,0,1)[7] with non-zero mean
##
## Coefficients:
## ma1 sma1 mean
## 0.0890 0.2355 -0.0005
## s.e. 0.0284 0.0209 0.0710
##
## sigma^2 estimated as 7.236: log likelihood=-6241.55
## AIC=12491.09 AICc=12491.11 BIC=12514.53
product2_arima_model = auto.arima(product2_dec_daily$random,
max.p = 2, max.q = 2)
product2_arima_model
## Series: product2_dec_daily$random
## ARIMA(2,0,0)(2,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ar2 sar1 sar2 sma1
## 0.3671 -0.4402 0.2061 0.1235 -0.1754
## s.e. 0.0189 0.0187 0.1113 0.0216 0.1122
##
## sigma^2 estimated as 123.6: log likelihood=-9918.79
## AIC=19849.58 AICc=19849.61 BIC=19884.74
product3_arima_model = auto.arima(product3_dec_daily$random,
max.p = 2, max.q = 2)
product3_arima_model
## Series: product3_dec_daily$random
## ARIMA(2,0,0)(2,0,2)[7] with non-zero mean
##
## Coefficients:
## ar1 ar2 sar1 sar2 sma1 sma2 mean
## 0.2245 -0.3637 1.1567 -0.8305 -1.1510 0.7316 -0.0056
## s.e. 0.0195 0.0196 0.0789 0.0421 0.1016 0.0647 0.0765
##
## sigma^2 estimated as 26.56: log likelihood=-7925.22
## AIC=15866.43 AICc=15866.49 BIC=15913.31
product4_arima_model = auto.arima(product4_dec_daily$random,
max.p = 2, max.q = 2)
product4_arima_model
## Series: product4_dec_daily$random
## ARIMA(1,0,2)(0,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 ma2 sma1
## -0.1484 0.5623 0.1678 0.1263
## s.e. 0.0900 0.0866 0.0332 0.0201
##
## sigma^2 estimated as 315640: log likelihood=-20086.45
## AIC=40182.9 AICc=40182.92 BIC=40212.2
product5_arima_model = auto.arima(product5_dec_daily$random,
max.p = 2, max.q = 2)
product5_arima_model
## Series: product5_dec_daily$random
## ARIMA(1,0,1)(1,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.0229 0.2056 -0.1856 0.2837
## s.e. 0.0530 0.0491 0.1669 0.1632
##
## sigma^2 estimated as 15049: log likelihood=-16142.25
## AIC=32294.49 AICc=32294.52 BIC=32323.79
product6_arima_model = auto.arima(product6_dec_daily$random,
max.p = 2, max.q = 2)
product6_arima_model
## Series: product6_dec_daily$random
## ARIMA(1,0,2)(1,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 ma2 sar1 sma1
## -0.2801 0.5579 0.3024 -0.5928 0.6915
## s.e. 0.0598 0.0547 0.0217 0.0719 0.0643
##
## sigma^2 estimated as 388.7: log likelihood=-11403.31
## AIC=22818.62 AICc=22818.65 BIC=22853.78
product7_arima_model = auto.arima(product7_dec_daily$random,
max.p = 2, max.q = 2)
product7_arima_model
## Series: product7_dec_daily$random
## ARIMA(1,0,2)(2,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 ma2 sar1 sar2 sma1
## -0.5086 0.8848 0.3308 -0.4717 -0.0636 0.5021
## s.e. 0.1020 0.0937 0.0239 0.1614 0.0238 0.1614
##
## sigma^2 estimated as 1046: log likelihood=-12685.84
## AIC=25385.69 AICc=25385.73 BIC=25426.71
product8_arima_model = auto.arima(product8_dec_daily$random,
max.p = 2, max.q = 2)
product8_arima_model
## Series: product8_dec_daily$random
## ARIMA(0,0,1)(0,0,1)[7] with zero mean
##
## Coefficients:
## ma1 sma1
## 0.2999 0.0483
## s.e. 0.0173 0.0193
##
## sigma^2 estimated as 67920: log likelihood=-18096.36
## AIC=36198.71 AICc=36198.72 BIC=36216.29
product9_arima_model = auto.arima(product9_dec_daily$random,
max.p = 2, max.q = 2)
product9_arima_model
## Series: product9_dec_daily$random
## ARIMA(1,0,1)(1,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.0651 0.4105 -0.0801 0.2063
## s.e. 0.0385 0.0325 0.1519 0.1494
##
## sigma^2 estimated as 1137: log likelihood=-12795.32
## AIC=25600.64 AICc=25600.66 BIC=25629.94
In this part, we will be trying to investigate the possible regressors that are likely to increase the quality of our arima models for each product. Some manipulations are made in order to get rid of the missing values in the product datas.
For each product, we will try to find a relationship between residuals and possible regressors by trying them sequentially. We checked the weekly difference values.
##Product1 ARIMAX
product1_data[, `:=`(price_diff, price - shift(price, 1))]
product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]
plot(product1_data[, list(price_diff, res)], col = day(product1_data$event_date))
product1_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]
plot(product1_data[, list(visit_count_diff, res)], col = day(product1_data$event_date))
product1_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]
plot(product1_data[, list(visit_count_diff, res)], col = day(product1_data$event_date))
product1_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]
plot(product1_data[, list(favored_count_diff, res)], col = day(product1_data$event_date))
product1_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
7))]
product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]
plot(product1_data[, list(category_sold_diff, res)], col = day(product1_data$event_date))
product1_data[, `:=`(category_basket_diff, category_basket -
shift(category_basket, 7))]
product1_data[, `:=`(res, c(rep(NA, 7), as.numeric(product1_arima_model$residuals[1:365])))]
plot(product1_data[, list(category_basket_diff, res)], col = day(product1_data$event_date))
There are lots of missing values on the Product1 data. Therefore, it’s hard to find a relationship between regressors and residuals.
product1_data[is.na(price_diff) == T, `:=`(price_diff, 0)]
price_diff_mat_p1 <- matrix(product1_data$price_diff)
arimax_product1 = auto.arima(product1_data$sold_count, xreg = price_diff_mat_p1,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,0,0) with zero mean : 2011.946
## ARIMA(0,0,0) with non-zero mean : 1993.988
## ARIMA(0,0,1) with zero mean : 1947.225
## ARIMA(0,0,1) with non-zero mean : 1936.938
## ARIMA(0,0,2) with zero mean : 1942.397
## ARIMA(0,0,2) with non-zero mean : 1934.058
## ARIMA(0,0,3) with zero mean : 1928.233
## ARIMA(0,0,3) with non-zero mean : 1922.884
## ARIMA(0,0,4) with zero mean : 1927.623
## ARIMA(0,0,4) with non-zero mean : 1921.127
## ARIMA(0,0,5) with zero mean : 1923.841
## ARIMA(0,0,5) with non-zero mean : 1918.159
## ARIMA(1,0,0) with zero mean : 1932.64
## ARIMA(1,0,0) with non-zero mean : 1926.603
## ARIMA(1,0,1) with zero mean : 1931.647
## ARIMA(1,0,1) with non-zero mean : 1927.301
## ARIMA(1,0,2) with zero mean : 1911.999
## ARIMA(1,0,2) with non-zero mean : 1912.7
## ARIMA(1,0,3) with zero mean : 1924.746
## ARIMA(1,0,3) with non-zero mean : 1918.114
## ARIMA(1,0,4) with zero mean : 1926.713
## ARIMA(1,0,4) with non-zero mean : 1920.203
## ARIMA(2,0,0) with zero mean : 1932.882
## ARIMA(2,0,0) with non-zero mean : 1927.751
## ARIMA(2,0,1) with zero mean : 1913.405
## ARIMA(2,0,1) with non-zero mean : 1914.101
## ARIMA(2,0,2) with zero mean : 1908.487
## ARIMA(2,0,2) with non-zero mean : 1909.145
## ARIMA(2,0,3) with zero mean : 1907.64
## ARIMA(2,0,3) with non-zero mean : 1908.355
## ARIMA(3,0,0) with zero mean : 1931.996
## ARIMA(3,0,0) with non-zero mean : 1927.914
## ARIMA(3,0,1) with zero mean : 1922.155
## ARIMA(3,0,1) with non-zero mean : 1917.875
## ARIMA(3,0,2) with zero mean : 1906.838
## ARIMA(3,0,2) with non-zero mean : 1907.562
## ARIMA(4,0,0) with zero mean : 1934.044
## ARIMA(4,0,0) with non-zero mean : 1929.788
## ARIMA(4,0,1) with zero mean : 1923.672
## ARIMA(4,0,1) with non-zero mean : 1918.782
## ARIMA(5,0,0) with zero mean : 1913.727
## ARIMA(5,0,0) with non-zero mean : 1911.807
##
##
##
## Best model: Regression with ARIMA(3,0,2) errors
##Product2 ARIMAX
Product2’s price is constant most of the time. Therefore, it doesn’t make sense to add price as regressor. Instead, we will check the category_sold, visit_count and favored_count as potential regressors. We can’t check the basket_count since it’s full with NA values.
Since we decomposed weekly, we are checking the weekly differences of potential regressors.
###Category_sold
product2_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
7))]
product2_data[, `:=`(res, c(rep(NA, 7), as.numeric(product2_arima_model$residuals[1:365])))]
plot(product2_data[, list(category_sold_diff, res)], col = day(product2_data$event_date))
It doesn’t seem that there is a relationship between residuals and weekly category_sold differencing.
product2_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
product2_data[, `:=`(res, c(rep(NA, 7), as.numeric(product2_arima_model$residuals[1:365])))]
plot(product2_data[, list(visit_count_diff, res)], col = day(product2_data$event_date))
There doesn’t seem a relation again.
product2_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
product2_data[, `:=`(res, c(rep(NA, 7), as.numeric(product2_arima_model$residuals[1:365])))]
plot(product2_data[, list(favored_count_diff, res)], col = day(product2_data$event_date))
We can’t see a clear relationship again. Visually, it looks like visit_count_diff seems best to go with among these 3 alternatives.
product2_data[is.na(visit_count_diff) == T, `:=`(visit_count_diff,
0)]
visit_count_diff_mat_p2 <- matrix(product1_data$visit_count_diff)
arimax_product2 = auto.arima(product2_data$sold_count, xreg = visit_count_diff_mat_p2,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,1,0) : 2878.313
## Regression with ARIMA(0,1,0) errors : 2879.804
## ARIMA(0,1,1) : 2864.49
## Regression with ARIMA(0,1,1) errors : 2865.415
## ARIMA(0,1,2) : 2851.691
## Regression with ARIMA(0,1,2) errors : 2852.139
## ARIMA(0,1,3) : 2852.403
## Regression with ARIMA(0,1,3) errors : 2853.04
## ARIMA(0,1,4) : 2853.9
## Regression with ARIMA(0,1,4) errors : 2854.473
## ARIMA(0,1,5) : 2817.056
## Regression with ARIMA(0,1,5) errors : 2817.206
## ARIMA(1,1,0) : 2871.023
## Regression with ARIMA(1,1,0) errors : 2872.262
## ARIMA(1,1,1) : 2859.361
## Regression with ARIMA(1,1,1) errors : 2859.766
## ARIMA(1,1,2) : 2852.651
## Regression with ARIMA(1,1,2) errors : 2853.212
## ARIMA(1,1,3) : 2854.425
## Regression with ARIMA(1,1,3) errors : 2855.072
## ARIMA(1,1,4) : 2850.092
## Regression with ARIMA(1,1,4) errors : 2850.348
## ARIMA(2,1,0) : 2847.763
## Regression with ARIMA(2,1,0) errors : 2848.405
## ARIMA(2,1,1) : 2846.962
## Regression with ARIMA(2,1,1) errors : 2847.698
## ARIMA(2,1,2) : Inf
## Regression with ARIMA(2,1,2) errors : Inf
## ARIMA(2,1,3) : 2837.214
## Regression with ARIMA(2,1,3) errors : 2838.285
## ARIMA(3,1,0) : 2845.51
## Regression with ARIMA(3,1,0) errors : 2846.365
## ARIMA(3,1,1) : 2847.575
## Regression with ARIMA(3,1,1) errors : 2848.442
## ARIMA(3,1,2) : 2837.905
## Regression with ARIMA(3,1,2) errors : 2838.937
## ARIMA(4,1,0) : 2847.569
## Regression with ARIMA(4,1,0) errors : 2848.44
## ARIMA(4,1,1) : 2844.678
## Regression with ARIMA(4,1,1) errors : 2844.963
## ARIMA(5,1,0) : 2835.786
## Regression with ARIMA(5,1,0) errors : 2836.318
##
##
##
## Best model: Regression with ARIMA(0,1,5) errors
##Product3 ARIMAX
product3_data[, `:=`(price_diff, price - shift(price, 7))]
product3_data[, `:=`(res, c(rep(NA, 7), as.numeric(product3_arima_model$residuals[1:365])))]
plot(product3_data[, list(price_diff, res)], col = day(product3_data$event_date))
product3_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
plot(product3_data[, list(visit_count_diff, res)], col = day(product3_data$event_date))
product3_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
plot(product3_data[, list(basket_count_diff, res)], col = day(product3_data$event_date))
product3_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
plot(product3_data[, list(favored_count_diff, res)], col = day(product3_data$event_date))
We can see a trend in basket_count. Therefore, we will go with it.
product3_data[is.na(basket_count) == T, `:=`(basket_count, 0)]
basket_count_mat_p3 <- matrix(product3_data$basket_count)
arimax_product3 = auto.arima(product3_data$sold_count, xreg = basket_count_mat_p3,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,0,0) with zero mean : 2303.674
## ARIMA(0,0,0) with non-zero mean : 2303.081
## ARIMA(0,0,1) with zero mean : 2221.798
## ARIMA(0,0,1) with non-zero mean : 2221.676
## ARIMA(0,0,2) with zero mean : 2217.999
## ARIMA(0,0,2) with non-zero mean : 2217.773
## ARIMA(0,0,3) with zero mean : 2216.895
## ARIMA(0,0,3) with non-zero mean : 2217.031
## ARIMA(0,0,4) with zero mean : 2216.274
## ARIMA(0,0,4) with non-zero mean : 2216.338
## ARIMA(0,0,5) with zero mean : 2207.519
## ARIMA(0,0,5) with non-zero mean : 2208.101
## ARIMA(1,0,0) with zero mean : 2211.654
## ARIMA(1,0,0) with non-zero mean : 2211.651
## ARIMA(1,0,1) with zero mean : 2213.669
## ARIMA(1,0,1) with non-zero mean : 2213.657
## ARIMA(1,0,2) with zero mean : Inf
## ARIMA(1,0,2) with non-zero mean : 2192.597
## ARIMA(1,0,3) with zero mean : Inf
## ARIMA(1,0,3) with non-zero mean : 2194.232
## ARIMA(1,0,4) with zero mean : Inf
## ARIMA(1,0,4) with non-zero mean : 2192.68
## ARIMA(2,0,0) with zero mean : 2213.687
## ARIMA(2,0,0) with non-zero mean : 2213.687
## ARIMA(2,0,1) with zero mean : Inf
## ARIMA(2,0,1) with non-zero mean : 2199.563
## ARIMA(2,0,2) with zero mean : Inf
## ARIMA(2,0,2) with non-zero mean : 2194.411
## ARIMA(2,0,3) with zero mean : 2193.684
## ARIMA(2,0,3) with non-zero mean : 2185.4
## ARIMA(3,0,0) with zero mean : 2206.848
## ARIMA(3,0,0) with non-zero mean : 2207.107
## ARIMA(3,0,1) with zero mean : Inf
## ARIMA(3,0,1) with non-zero mean : 2191.253
## ARIMA(3,0,2) with zero mean : 2180.082
## ARIMA(3,0,2) with non-zero mean : 2180.692
## ARIMA(4,0,0) with zero mean : 2202.974
## ARIMA(4,0,0) with non-zero mean : 2202.874
## ARIMA(4,0,1) with zero mean : Inf
## ARIMA(4,0,1) with non-zero mean : 2192.681
## ARIMA(5,0,0) with zero mean : 2201.788
## ARIMA(5,0,0) with non-zero mean : 2201.751
##
##
##
## Best model: Regression with ARIMA(3,0,2) errors
##Product4 ARIMAX
product4_data[, `:=`(price_diff, price - shift(price, 7))]
product4_data[, `:=`(res, c(rep(NA, 7), as.numeric(product4_arima_model$residuals[1:365])))]
plot(product4_data[, list(price_diff, res)], col = day(product4_data$event_date))
product4_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
plot(product4_data[, list(visit_count_diff, res)], col = day(product4_data$event_date))
product4_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
plot(product4_data[, list(basket_count_diff, res)], col = day(product4_data$event_date))
product4_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
plot(product4_data[, list(favored_count_diff, res)], col = day(product4_data$event_date))
product4_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
4))]
plot(product4_data[, list(category_sold_diff, res)], col = day(product4_data$event_date))
We see a decreasing residual trend with the increasing category_sold_differencing levels. Notice here the rate is not 7 but 4.
product4_data[is.na(category_sold_diff) == T, `:=`(category_sold_diff,
0)]
category_sold_diff_mat_p4 <- matrix(product4_data$category_sold_diff)
arimax_product4 = auto.arima(product4_data$sold_count, xreg = category_sold_diff_mat_p4,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,1,0) : 5669.004
## Regression with ARIMA(0,1,0) errors : 5671.036
## ARIMA(0,1,1) : 5664.402
## Regression with ARIMA(0,1,1) errors : 5666.445
## ARIMA(0,1,2) : 5665.855
## Regression with ARIMA(0,1,2) errors : 5667.909
## ARIMA(0,1,3) : 5663.487
## Regression with ARIMA(0,1,3) errors : 5665.553
## ARIMA(0,1,4) : 5648.298
## Regression with ARIMA(0,1,4) errors : 5650.366
## ARIMA(0,1,5) : 5615.193
## Regression with ARIMA(0,1,5) errors : 5617.282
## ARIMA(1,1,0) : 5664.959
## Regression with ARIMA(1,1,0) errors : 5667.001
## ARIMA(1,1,1) : 5638.573
## Regression with ARIMA(1,1,1) errors : 5640.628
## ARIMA(1,1,2) : 5640.523
## Regression with ARIMA(1,1,2) errors : 5642.589
## ARIMA(1,1,3) : 5651.508
## Regression with ARIMA(1,1,3) errors : 5653.585
## ARIMA(1,1,4) : 5642.272
## Regression with ARIMA(1,1,4) errors : 5644.355
## ARIMA(2,1,0) : 5666.475
## Regression with ARIMA(2,1,0) errors : 5668.529
## ARIMA(2,1,1) : 5640.504
## Regression with ARIMA(2,1,1) errors : 5642.571
## ARIMA(2,1,2) : 5638.239
## Regression with ARIMA(2,1,2) errors : 5640.569
## ARIMA(2,1,3) : 5637.833
## Regression with ARIMA(2,1,3) errors : 5639.921
## ARIMA(3,1,0) : 5663.66
## Regression with ARIMA(3,1,0) errors : 5665.725
## ARIMA(3,1,1) : 5659.539
## Regression with ARIMA(3,1,1) errors : 5661.614
## ARIMA(3,1,2) : 5632.567
## Regression with ARIMA(3,1,2) errors : 5634.654
## ARIMA(4,1,0) : 5659.181
## Regression with ARIMA(4,1,0) errors : 5661.256
## ARIMA(4,1,1) : 5658.582
## Regression with ARIMA(4,1,1) errors : 5660.668
## ARIMA(5,1,0) : 5655.374
## Regression with ARIMA(5,1,0) errors : 5657.462
##
##
##
## Best model: Regression with ARIMA(0,1,5) errors
##Product5 ARIMAX
product5_data[, `:=`(price_diff, price - shift(price, 4))]
product5_data[, `:=`(res, c(rep(NA, 7), as.numeric(product5_arima_model$residuals[1:365])))]
plot(product5_data[, list(price_diff, res)], col = day(product5_data$event_date))
product5_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
4))]
plot(product5_data[, list(basket_count_diff, res)], col = day(product5_data$event_date))
product5_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
4))]
plot(product5_data[, list(favored_count_diff, res)], col = day(product5_data$event_date))
product5_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
4))]
plot(product5_data[, list(category_sold_diff, res)], col = day(product5_data$event_date))
product5_data[, `:=`(category_brand_sold_diff, category_brand_sold -
shift(category_brand_sold, 4))]
plot(product5_data[, list(category_brand_sold_diff, res)], col = day(product5_data$event_date))
With 4 levels of differencing, we can see trends in residuals.
arima_p5 <- arima(product5_data$sold_count, order = c(1, 0, 1),
seasonal = c(1, 0, 1))
product5_data[, `:=`(residuals, residuals(arima_p5))]
residuals_mat_p5 <- matrix(product5_data$residuals)
arimax_product5 = auto.arima(product5_data$sold_count, xreg = residuals_mat_p5,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,0,0) with zero mean : 5556.707
## ARIMA(0,0,0) with non-zero mean : 4817.509
## ARIMA(0,0,1) with zero mean : Inf
## ARIMA(0,0,1) with non-zero mean : Inf
## ARIMA(0,0,2) with zero mean : 4696.098
## ARIMA(0,0,2) with non-zero mean : Inf
## ARIMA(0,0,3) with zero mean : Inf
## ARIMA(0,0,3) with non-zero mean : Inf
## ARIMA(0,0,4) with zero mean : Inf
## ARIMA(0,0,4) with non-zero mean : Inf
## ARIMA(0,0,5) with zero mean : Inf
## ARIMA(0,0,5) with non-zero mean : Inf
## ARIMA(1,0,0) with zero mean : 4335.941
## ARIMA(1,0,0) with non-zero mean : 4315.347
## ARIMA(1,0,1) with zero mean : Inf
## ARIMA(1,0,1) with non-zero mean : Inf
## ARIMA(1,0,2) with zero mean : Inf
## ARIMA(1,0,2) with non-zero mean : Inf
## ARIMA(1,0,3) with zero mean : Inf
## ARIMA(1,0,3) with non-zero mean : Inf
## ARIMA(1,0,4) with zero mean : Inf
## ARIMA(1,0,4) with non-zero mean : Inf
## ARIMA(2,0,0) with zero mean : 4253.731
## ARIMA(2,0,0) with non-zero mean : 4193.825
## ARIMA(2,0,1) with zero mean : Inf
## ARIMA(2,0,1) with non-zero mean : Inf
## ARIMA(2,0,2) with zero mean : Inf
## ARIMA(2,0,2) with non-zero mean : Inf
## ARIMA(2,0,3) with zero mean : Inf
## ARIMA(2,0,3) with non-zero mean : Inf
## ARIMA(3,0,0) with zero mean : 4181.257
## ARIMA(3,0,0) with non-zero mean : 4153.968
## ARIMA(3,0,1) with zero mean : Inf
## ARIMA(3,0,1) with non-zero mean : Inf
## ARIMA(3,0,2) with zero mean : Inf
## ARIMA(3,0,2) with non-zero mean : Inf
## ARIMA(4,0,0) with zero mean : 4181.667
## ARIMA(4,0,0) with non-zero mean : 4147.894
## ARIMA(4,0,1) with zero mean : Inf
## ARIMA(4,0,1) with non-zero mean : Inf
## ARIMA(5,0,0) with zero mean : 4157.536
## ARIMA(5,0,0) with non-zero mean : 4136.147
##
##
##
## Best model: Regression with ARIMA(5,0,0) errors
##Product6 ARIMAX
product6_data[, `:=`(price_diff, price - shift(price, 7))]
product6_data[, `:=`(res, c(rep(NA, 7), as.numeric(product6_arima_model$residuals[1:365])))]
plot(product6_data[, list(price_diff, res)], col = day(product6_data$event_date))
product6_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
plot(product6_data[, list(visit_count_diff, res)], col = day(product6_data$event_date))
product6_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
plot(product6_data[, list(basket_count_diff, res)], col = day(product6_data$event_date))
product6_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
plot(product6_data[, list(favored_count_diff, res)], col = day(product6_data$event_date))
product6_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
7))]
plot(product6_data[, list(category_sold_diff, res)], col = day(product6_data$event_date))
product6_data[, `:=`(category_brand_sold_diff, category_brand_sold -
shift(category_brand_sold, 7))]
plot(product6_data[, list(category_brand_sold_diff, res)], col = day(product6_data$event_date))
product6_data[is.na(basket_count_diff) == T, `:=`(basket_count_diff,
0)]
basket_count_diff_mat_p6 <- matrix(product6_data$basket_count_diff)
arimax_product6 = auto.arima(product6_data$sold_count, xreg = basket_count_diff_mat_p6,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,1,0) : 3451.13
## Regression with ARIMA(0,1,0) errors : 3453.162
## ARIMA(0,1,1) : 3419.629
## Regression with ARIMA(0,1,1) errors : 3421.67
## ARIMA(0,1,2) : 3395.61
## Regression with ARIMA(0,1,2) errors : 3397.608
## ARIMA(0,1,3) : 3371.232
## Regression with ARIMA(0,1,3) errors : 3373.215
## ARIMA(0,1,4) : 3357.304
## Regression with ARIMA(0,1,4) errors : 3359.275
## ARIMA(0,1,5) : 3357.529
## Regression with ARIMA(0,1,5) errors : 3359.528
## ARIMA(1,1,0) : 3422.638
## Regression with ARIMA(1,1,0) errors : 3424.68
## ARIMA(1,1,1) : 3379.96
## Regression with ARIMA(1,1,1) errors : 3381.922
## ARIMA(1,1,2) : 3381.409
## Regression with ARIMA(1,1,2) errors : 3383.379
## ARIMA(1,1,3) : 3366.678
## Regression with ARIMA(1,1,3) errors : 3368.654
## ARIMA(1,1,4) : 3356.009
## Regression with ARIMA(1,1,4) errors : 3358.003
## ARIMA(2,1,0) : 3424.438
## Regression with ARIMA(2,1,0) errors : 3426.491
## ARIMA(2,1,1) : 3380.865
## Regression with ARIMA(2,1,1) errors : 3382.833
## ARIMA(2,1,2) : 3381.591
## Regression with ARIMA(2,1,2) errors : 3383.569
## ARIMA(2,1,3) : 3342.867
## Regression with ARIMA(2,1,3) errors : 3344.893
## ARIMA(3,1,0) : 3425.571
## Regression with ARIMA(3,1,0) errors : 3427.635
## ARIMA(3,1,1) : 3370.159
## Regression with ARIMA(3,1,1) errors : 3372.153
## ARIMA(3,1,2) : 3358.712
## Regression with ARIMA(3,1,2) errors : 3360.76
## ARIMA(4,1,0) : 3368.522
## Regression with ARIMA(4,1,0) errors : 3370.593
## ARIMA(4,1,1) : 3338.49
## Regression with ARIMA(4,1,1) errors : 3340.532
## ARIMA(5,1,0) : 3358.375
## Regression with ARIMA(5,1,0) errors : 3360.453
##
##
##
## Best model: Regression with ARIMA(4,1,1) errors
##Product7 ARIMAX
product7_data[, `:=`(price_diff, price - shift(price, 7))]
product7_data[, `:=`(res, c(rep(NA, 7), as.numeric(product7_arima_model$residuals[1:365])))]
plot(product7_data[, list(price_diff, res)], col = day(product7_data$event_date))
product7_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
plot(product7_data[, list(visit_count_diff, res)], col = day(product7_data$event_date))
product7_data$basket_count
## [1] 744 633 570 397 515 982 534 569 692 616 546 512 507 607 561
## [16] 832 1141 1030 885 1010 688 897 862 908 983 1108 834 874 774 429
## [31] 551 1114 970 866 893 517 413 375 360 313 321 331 380 436 368
## [46] 441 502 409 423 507 702 584 545 583 885 871 799 716 646 583
## [61] 438 421 461 366 449 565 307 263 264 340 316 416 330 285 373
## [76] 364 385 329 241 420 589 796 769 944 548 495 424 402 413 460
## [91] 428 536 627 552 394 442 468 470 583 766 612 482 482 450 403
## [106] 471 432 502 591 344 290 465 445 531 489 428 654 674 1001 579
## [121] 469 410 377 565 964 293 293 398 305 233 258 328 383 499 715
## [136] 608 585 665 866 1338 653 351 217 260 315 335 320 227 297 311
## [151] 207 124 268 311 283 252 167 182 225 637 404 545 461 400 294
## [166] 263 227 230 209 272 286 291 334 313 349 367 272 283 345 197
## [181] 181 192 188 349 544 817 623 542 519 197 187 170 131 176 154
## [196] 170 125 145 239 453 619 828 1021 882 891 252 307 154 130 107
## [211] 139 114 86 122 141 100 111 122 117 115 103 126 149 146 116
## [226] 180 141 165 199 475 154 130 145 195 311 286 342 589 479 164
## [241] 159 158 214 431 157 164 139 143 132 119 133 157 205 200 124
## [256] 117 184 165 200 159 135 206 206 432 393 739 372 91 89 62
## [271] 129 93 117 127 164 126 109 152 98 112 108 106 165 185 529
## [286] 407 377 264 96 91 130 161 139 102 105 97 112 141 124 139
## [301] 161 164 122 128 132 209 70 125 90 80 143 127 92 99 78
## [316] 89 105 140 115 122 80 90 104 99 137 224 365 300 321 159
## [331] 116 107 153 131 166 155 141 158 199 244 203 207 161 171 176
## [346] 146 228 353 365 434 131 140 221 196 279 199 151 159 215 203
## [361] 159 188 185 152 193 135 167 162 168 228 282 233
product7_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
plot(product7_data[, list(basket_count_diff, res)], col = day(product7_data$event_date))
product7_data$favored_count
## [1] 1642 1104 913 868 985 932 583 711 1115 1290 917 950 711 827 1088
## [16] 1034 2027 1121 696 1471 858 897 828 1119 1169 1774 906 897 1027 1188
## [31] 535 1661 819 999 1359 689 801 700 470 397 396 465 426 550 548
## [46] 557 567 544 593 573 927 634 584 696 886 1077 1172 894 2254 1730
## [61] 519 917 1337 574 958 2856 643 891 462 487 486 646 412 409 522
## [76] 675 508 438 548 389 777 1115 1063 1062 743 726 825 723 493 521
## [91] 547 691 868 756 681 607 556 647 665 1712 734 648 1218 577 515
## [106] 543 928 844 642 471 419 560 1064 821 762 696 800 605 990 1003
## [121] 1080 823 696 812 1806 846 1096 2593 595 624 617 1059 1060 1339 1452
## [136] 1048 732 989 2198 2882 1140 713 507 612 635 583 962 526 923 689
## [151] 429 235 415 496 468 598 330 298 277 796 772 579 635 636 450
## [166] 371 311 484 382 472 424 642 767 468 657 872 953 368 383 391
## [181] 362 273 287 0 0 0 0 0 0 0 0 0 0 0 0
## [196] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [211] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [226] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [241] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [256] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [271] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [286] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [301] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [331] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [346] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [361] 0 0 0 0 0 0 0 0 0 0 0 0
product7_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
plot(product7_data[, list(favored_count_diff, res)], col = day(product7_data$event_date))
We see some degree of trend in visit_count.
product7_data[is.na(visit_count) == T, `:=`(visit_count, 0)]
visit_count_mat_p7 <- matrix(product7_data$visit_count)
arimax_product7 = auto.arima(product7_data$sold_count, xreg = visit_count_mat_p7,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,0,0) with zero mean : 4309.262
## ARIMA(0,0,0) with non-zero mean : 4106.892
## ARIMA(0,0,1) with zero mean : 4058.455
## ARIMA(0,0,1) with non-zero mean : 3920.888
## ARIMA(0,0,2) with zero mean : 3934.093
## ARIMA(0,0,2) with non-zero mean : 3840.657
## ARIMA(0,0,3) with zero mean : 3860.807
## ARIMA(0,0,3) with non-zero mean : 3797.017
## ARIMA(0,0,4) with zero mean : 3837.916
## ARIMA(0,0,4) with non-zero mean : 3790.669
## ARIMA(0,0,5) with zero mean : 3836.41
## ARIMA(0,0,5) with non-zero mean : 3792.748
## ARIMA(1,0,0) with zero mean : 3817.559
## ARIMA(1,0,0) with non-zero mean : 3802.401
## ARIMA(1,0,1) with zero mean : 3819.597
## ARIMA(1,0,1) with non-zero mean : 3803.811
## ARIMA(1,0,2) with zero mean : 3813.72
## ARIMA(1,0,2) with non-zero mean : 3792.356
## ARIMA(1,0,3) with zero mean : 3815.729
## ARIMA(1,0,3) with non-zero mean : 3790.374
## ARIMA(1,0,4) with zero mean : Inf
## ARIMA(1,0,4) with non-zero mean : 3792.409
## ARIMA(2,0,0) with zero mean : 3819.596
## ARIMA(2,0,0) with non-zero mean : 3803.499
## ARIMA(2,0,1) with zero mean : 3820.105
## ARIMA(2,0,1) with non-zero mean : 3805.808
## ARIMA(2,0,2) with zero mean : 3815.77
## ARIMA(2,0,2) with non-zero mean : 3792.118
## ARIMA(2,0,3) with zero mean : 3817.549
## ARIMA(2,0,3) with non-zero mean : 3792.451
## ARIMA(3,0,0) with zero mean : 3816.403
## ARIMA(3,0,0) with non-zero mean : 3793.059
## ARIMA(3,0,1) with zero mean : 3818.062
## ARIMA(3,0,1) with non-zero mean : 3795.013
## ARIMA(3,0,2) with zero mean : 3810.633
## ARIMA(3,0,2) with non-zero mean : 3790.374
## ARIMA(4,0,0) with zero mean : 3817.132
## ARIMA(4,0,0) with non-zero mean : 3794.901
## ARIMA(4,0,1) with zero mean : Inf
## ARIMA(4,0,1) with non-zero mean : 3796.461
## ARIMA(5,0,0) with zero mean : 3811.272
## ARIMA(5,0,0) with non-zero mean : 3794.64
##
##
##
## Best model: Regression with ARIMA(3,0,2) errors
##Product8 ARIMAX
product8_data[, `:=`(price_diff, price - shift(price, 7))]
product8_data[, `:=`(res, c(rep(NA, 7), as.numeric(product8_arima_model$residuals[1:365])))]
plot(product8_data[, list(price_diff, res)], col = day(product8_data$event_date))
product8_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
plot(product8_data[, list(visit_count_diff, res)], col = day(product8_data$event_date))
product8_data$basket_count
## [1] 5096 1716 1782 1507 1646 1771 1740 1804 2464 2034 1769 2620 1919 2599 2579
## [16] 560 486 431 376 1123 600 977 413 472 497 651 630 642 663 1053
## [31] 1491 1980 3156 3031 3633 2837 806 1673 4463 2529 823 865 1042 1159 1150
## [46] 1146 1427 1265 1096 1333 1684 1710 1455 2090 2742 3792 6408 2818 707 646
## [61] 932 1233 828 966 1164 942 1156 1355 1797 2162 981 981 906 909 859
## [76] 991 815 809 499 899 1235 2436 3186 4332 1826 726 605 637 576 618
## [91] 614 1002 855 838 702 1010 1722 1857 1253 574 732 448 551 829 993
## [106] 1209 1326 1471 2042 1847 950 746 652 519 593 697 966 1417 1714 677
## [121] 377 373 336 309 489 318 252 233 121 77 266 567 413 323 313
## [136] 321 393 416 457 490 569 395 351 379 391 472 367 336 461 328
## [151] 428 234 269 217 245 226 244 263 810 1317 1547 2230 2123 787 543
## [166] 566 621 689 584 565 619 420 464 422 388 284 334 318 192 953
## [181] 1271 1410 235 507 1195 2387 2189 2116 1307 229 246 287 233 265 269
## [196] 222 268 585 1016 2066 2148 3157 3504 5473 5509 535 669 698 684 379
## [211] 1147 427 290 299 332 387 391 340 327 333 366 382 410 436 447
## [226] 551 501 576 541 588 638 2999 935 1429 2072 5364 3475 4797 1803 419
## [241] 425 406 538 541 400 412 371 420 412 546 628 993 796 309 307
## [256] 283 385 485 978 662 561 698 1574 2791 1939 4953 1504 343 346 280
## [271] 372 511 614 539 434 404 412 227 270 300 503 406 514 951 3808
## [286] 3223 3126 1313 576 487 586 618 837 1298 1110 543 555 569 733 1085
## [301] 618 503 359 277 196 364 627 509 1089 2332 1357 722 519 944 511
## [316] 530 397 391 176 360 361 253 462 428 523 856 1230 712 1517 598
## [331] 329 414 206 317 297 259 284 234 279 342 309 256 260 243 223
## [346] 341 602 764 1045 2101 597 380 350 314 300 377 311 362 274 312
## [361] 313 431 806 312 344 332 292 296 340 399 330 307
product8_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
plot(product8_data[, list(basket_count_diff, res)], col = day(product8_data$event_date))
product8_data$favored_count
## [1] 1703 494 509 405 397 409 366 376 485 375 332 465 309 562 581
## [16] 181 146 121 191 631 158 229 106 104 124 179 158 208 222 320
## [31] 324 437 820 830 1579 1050 489 791 1729 493 259 327 320 276 314
## [46] 317 281 303 340 324 408 401 293 530 760 1244 2286 714 240 233
## [61] 334 351 257 352 430 257 292 327 454 597 354 307 219 223 261
## [76] 234 237 244 248 179 332 657 966 1706 750 370 340 372 220 189
## [91] 281 237 319 337 253 285 585 762 317 308 377 228 176 315 414
## [106] 404 328 355 539 627 328 248 205 176 192 257 399 583 1068 617
## [121] 303 294 219 115 148 126 121 97 61 37 137 188 151 121 147
## [136] 134 150 209 225 336 424 187 174 197 207 261 268 187 205 193
## [151] 149 135 144 126 164 166 153 206 325 602 524 794 811 410 418
## [166] 245 273 290 291 273 304 201 253 252 239 143 214 196 137 117
## [181] 143 162 124 0 0 0 0 0 0 0 0 0 0 0 0
## [196] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [211] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [226] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [241] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [256] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [271] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [286] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [301] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [331] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [346] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [361] 0 0 0 0 0 0 0 0 0 0 0 0
product8_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
plot(product8_data[, list(favored_count_diff, res)], col = day(product8_data$event_date))
product8_data$category_sold
## [1] 6547 3691 3604 1598 1775 1948 1751 1984 2603 2543 2764 4427
## [13] 2036 3005 2726 1911 1757 1446 990 1519 1815 2442 1415 1351
## [25] 1697 1708 1654 1408 1533 1790 1984 2691 4215 3899 4406 2539
## [37] 1233 1434 2773 2081 1356 1497 1767 1903 1786 1939 2363 1762
## [49] 1633 2108 2270 1919 1777 2577 3587 4058 5182 2557 1302 1134
## [61] 1351 1656 1248 1380 1735 1377 1847 2252 3313 2949 1753 1352
## [73] 1043 1060 1161 1503 1185 1296 1524 1150 1814 3585 3934 4667
## [85] 1800 887 703 866 1048 1181 1194 1465 1856 1588 1426 1767
## [97] 2841 2664 2257 1554 1412 1231 1338 1737 1935 2106 1748 1740
## [109] 1702 1669 1438 1380 1415 1919 1961 2544 3996 4427 5175 2044
## [121] 1119 1074 999 1060 1223 1291 1244 1294 1179 965 1265 1721
## [133] 1580 1686 1680 1758 2034 2222 2258 2297 1578 1435 1422 1458
## [145] 2043 2378 1239 1124 1033 942 958 697 829 813 885 819
## [157] 878 751 1956 5332 6392 3745 1738 896 978 1230 1504 1835
## [169] 1392 1523 1565 867 939 1059 957 905 976 875 631 624
## [181] 601 627 623 805 2738 7031 6391 8825 3328 617 591 562
## [193] 730 745 727 773 798 914 1070 2681 6481 8392 8003 13114
## [205] 3652 793 1311 957 990 1016 1096 1026 959 844 875 958
## [217] 1357 2040 619 667 729 867 1083 979 913 964 830 877
## [229] 1057 818 983 2948 1158 1334 2731 8856 6453 6885 2276 942
## [241] 850 1006 1115 1147 915 907 801 731 707 1444 1463 2045
## [253] 1453 722 673 738 962 1141 1825 1100 643 1034 2197 4346
## [265] 4280 5700 1472 608 627 636 812 1112 850 902 721 641
## [277] 675 582 643 745 809 668 721 1541 7639 8167 4888 1603
## [289] 934 980 983 1258 3082 3007 2259 1021 1168 819 1169 1445
## [301] 827 810 500 323 234 456 605 702 1203 2185 1954 954
## [313] 894 1205 1047 932 686 705 588 776 600 547 955 1178
## [325] 855 1967 3587 2419 4039 1281 404 475 443 547 487 499
## [337] 451 401 342 419 408 404 435 465 348 413 648 858
## [349] 1331 1659 700 450 402 401 446 458 504 517 379 387
## [361] 438 483 855 503 861 518 494 473 550 562 486 413
product8_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
7))]
plot(product8_data[, list(category_sold_diff, res)], col = day(product8_data$event_date))
Category_sold_diff could be added to the ARIMA model.
product8_data[is.na(category_sold_diff) == T, `:=`(category_sold_diff,
0)]
category_sold_mat_p8 <- matrix(product8_data$category_sold_diff)
arimax_product8 = auto.arima(product7_data$sold_count, xreg = category_sold_mat_p8,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,1,0) : 3792.455
## Regression with ARIMA(0,1,0) errors : 3794.486
## ARIMA(0,1,1) : 3789.502
## Regression with ARIMA(0,1,1) errors : 3791.542
## ARIMA(0,1,2) : 3785.631
## Regression with ARIMA(0,1,2) errors : 3787.677
## ARIMA(0,1,3) : 3750.039
## Regression with ARIMA(0,1,3) errors : 3751.343
## ARIMA(0,1,4) : 3737.334
## Regression with ARIMA(0,1,4) errors : Inf
## ARIMA(0,1,5) : 3737.849
## Regression with ARIMA(0,1,5) errors : Inf
## ARIMA(1,1,0) : 3790.03
## Regression with ARIMA(1,1,0) errors : 3792.071
## ARIMA(1,1,1) : 3741.426
## Regression with ARIMA(1,1,1) errors : Inf
## ARIMA(1,1,2) : 3742.646
## Regression with ARIMA(1,1,2) errors : Inf
## ARIMA(1,1,3) : 3736.762
## Regression with ARIMA(1,1,3) errors : Inf
## ARIMA(1,1,4) : 3737.874
## Regression with ARIMA(1,1,4) errors : Inf
## ARIMA(2,1,0) : 3791.643
## Regression with ARIMA(2,1,0) errors : 3793.694
## ARIMA(2,1,1) : 3742.302
## Regression with ARIMA(2,1,1) errors : Inf
## ARIMA(2,1,2) : 3743.93
## Regression with ARIMA(2,1,2) errors : Inf
## ARIMA(2,1,3) : 3738.106
## Regression with ARIMA(2,1,3) errors : Inf
## ARIMA(3,1,0) : 3780.262
## Regression with ARIMA(3,1,0) errors : 3782.323
## ARIMA(3,1,1) : 3737.265
## Regression with ARIMA(3,1,1) errors : Inf
## ARIMA(3,1,2) : 3739.262
## Regression with ARIMA(3,1,2) errors : Inf
## ARIMA(4,1,0) : 3768.255
## Regression with ARIMA(4,1,0) errors : 3770.326
## ARIMA(4,1,1) : 3739.184
## Regression with ARIMA(4,1,1) errors : Inf
## ARIMA(5,1,0) : 3764.502
## Regression with ARIMA(5,1,0) errors : 3766.582
##
##
##
## Best model: Regression with ARIMA(1,1,3) errors
##Product9 ARIMAX
product9_data[, `:=`(price_diff, price - shift(price, 7))]
product9_data[, `:=`(res, c(rep(NA, 7), as.numeric(product9_arima_model$residuals[1:365])))]
plot(product9_data[, list(price_diff, res)], col = day(product9_data$event_date))
product9_data[, `:=`(visit_count_diff, visit_count - shift(visit_count,
7))]
plot(product9_data[, list(visit_count_diff, res)], col = day(product9_data$event_date))
product9_data[, `:=`(basket_count_diff, basket_count - shift(basket_count,
7))]
plot(product9_data[, list(basket_count_diff, res)], col = day(product9_data$event_date))
product9_data$favored_count
## [1] 698 609 624 608 869 859 525 524 745 1001 812 801 651 834 964
## [16] 1200 1160 936 1024 1465 2076 2448 1780 1210 1262 840 738 768 1394 983
## [31] 885 941 790 1080 1093 1559 1685 1560 1706 593 899 795 567 927 655
## [46] 1148 1388 546 454 718 870 671 474 564 1086 2119 2696 1920 747 529
## [61] 929 872 789 748 674 437 352 239 175 548 457 317 235 345 493
## [76] 443 948 181 159 181 381 973 1427 1528 1511 1106 1645 2085 633 755
## [91] 1599 873 940 551 417 452 674 1326 676 689 621 519 302 346 281
## [106] 331 432 342 290 220 269 296 728 968 436 307 442 614 720 408
## [121] 319 212 186 78 162 261 237 222 160 187 353 510 557 465 588
## [136] 518 921 996 1155 1513 802 676 688 507 446 396 199 101 228 229
## [151] 168 86 88 121 105 180 216 244 745 1990 1213 541 255 290 233
## [166] 275 261 221 212 344 435 298 277 299 463 692 1395 788 269 209
## [181] 191 145 117 0 0 0 0 0 0 0 0 0 0 0 0
## [196] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [211] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [226] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [241] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [256] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [271] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [286] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [301] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [316] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [331] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [346] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [361] 0 0 0 0 0 0 0 0 0 0 0 0
product9_data[, `:=`(favored_count_diff, favored_count - shift(favored_count,
7))]
plot(product9_data[, list(favored_count_diff, res)], col = day(product9_data$event_date))
product9_data$category_sold
## [1] 930 911 895 807 800 835 698 778 806 926 806 842 802 1030 1180
## [16] 1304 1142 1048 981 1107 1398 1537 1341 1358 1437 1485 1338 1111 1159 949
## [31] 1090 1576 2078 2144 2006 1042 836 971 1049 709 1023 928 1017 968 953
## [46] 1063 1290 943 844 884 1121 939 928 1265 1766 1866 1768 1271 693 613
## [61] 829 899 741 767 819 685 726 847 1280 1384 816 735 729 872 904
## [76] 1077 1109 966 863 801 1539 3132 4031 2462 1252 683 598 615 640 970
## [91] 909 726 705 732 638 719 1186 1218 1081 1080 1128 1091 984 1281 1241
## [106] 1311 1379 1675 1479 1260 1160 1115 1099 1197 1311 2121 3691 3891 2792 1193
## [121] 652 650 610 668 767 930 979 855 856 718 757 772 826 760 916
## [136] 953 1207 1319 1795 1551 939 908 904 893 833 918 778 720 717 659
## [151] 596 551 548 610 600 724 825 867 1713 3414 3413 3043 829 697 707
## [166] 836 917 984 899 1147 1239 834 684 737 706 778 994 1004 659 627
## [181] 656 512 576 845 2247 5557 4145 5139 1436 477 497 408 437 419 488
## [196] 529 638 753 779 1951 4472 4908 3663 4708 1509 329 399 366 448 479
## [211] 452 429 350 344 406 391 418 434 421 454 419 514 559 560 623
## [226] 661 558 605 632 747 544 620 617 625 1301 2527 2103 2554 1073 536
## [241] 406 480 625 736 708 740 703 761 789 882 1142 1233 1528 641 668
## [256] 624 746 783 951 804 607 721 1531 4096 3744 3185 1018 560 469 500
## [271] 548 553 800 770 431 415 481 571 590 652 663 560 629 1218 2986
## [286] 1770 1799 1118 835 625 593 425 472 480 505 455 520 498 582 626
## [301] 733 807 659 548 445 560 648 433 378 598 553 504 400 483 525
## [316] 466 442 361 514 596 539 468 456 452 480 742 1524 1271 1486 587
## [331] 321 360 646 507 444 461 485 500 441 389 459 559 563 520 543
## [346] 597 962 1438 1291 1437 962 718 662 833 907 1021 799 778 712 725
## [361] 823 981 1210 809 809 902 842 810 927 1071 1351 1193
product9_data[, `:=`(category_sold_diff, category_sold - shift(category_sold,
7))]
plot(product9_data[, list(category_sold_diff, res)], col = day(product9_data$event_date))
product9_data$category_brand_sold
## [1] 875 866 857 3370 3946 3769 3019 3144 4260 5373 4010 3587
## [13] 3444 5287 4543 5925 5366 4884 4265 4992 6454 6717 5763 5658
## [25] 5647 5043 5341 4254 6758 5354 5412 8649 9786 15283 9991 7894
## [37] 7579 9130 9528 3763 4897 4380 4702 7122 5424 5227 6516 4603
## [49] 3285 3862 5379 4182 3513 3898 6101 10600 8913 7840 3770 2818
## [61] 3241 3537 4709 4997 4572 3138 3631 4388 5156 6120 4295 4467
## [73] 3935 4887 5470 5329 5887 5608 4245 3123 5821 12868 16442 11074
## [85] 8138 6196 6732 6619 5442 5878 8359 5416 5346 4345 3986 4346
## [97] 7195 8083 7374 9184 7628 7868 6117 6621 6542 7542 9416 12180
## [109] 9697 8022 6420 5476 7309 8772 9898 9637 18425 28944 17260 9761
## [121] 8319 6084 5360 3574 4776 7846 7310 7446 4938 4521 4868 4851
## [133] 4091 3755 5106 5005 6628 6611 9282 8788 7464 6527 6083 5639
## [145] 6919 9471 6224 5068 7733 6872 5040 3380 3673 4732 4541 6005
## [157] 5854 5649 10710 21292 23931 17068 9159 6429 5016 5609 5089 5421
## [169] 5345 6095 7110 4513 4651 4271 4232 5039 7355 6527 5288 7799
## [181] 4276 3441 3989 0 0 0 0 0 0 0 0 0
## [193] 0 0 0 0 0 0 0 0 0 0 0 0
## [205] 0 0 0 0 0 0 0 0 0 0 0 0
## [217] 0 0 0 0 0 0 0 0 0 0 0 0
## [229] 0 0 0 0 0 0 0 0 0 0 0 0
## [241] 0 0 0 0 0 0 0 0 0 0 0 0
## [253] 0 0 0 0 0 0 0 0 0 0 0 0
## [265] 0 0 0 0 0 0 0 0 0 0 0 0
## [277] 0 0 0 0 0 0 0 0 0 0 0 0
## [289] 0 0 0 0 0 0 0 0 0 0 0 0
## [301] 0 0 0 0 0 0 0 0 0 0 0 0
## [313] 0 0 0 0 0 0 0 0 0 0 0 0
## [325] 0 0 0 0 0 0 0 0 0 0 0 0
## [337] 0 0 0 0 0 0 0 0 0 0 0 0
## [349] 0 0 0 0 0 0 0 0 0 0 0 0
## [361] 0 0 0 0 0 0 0 0 0 0 0 0
product9_data[, `:=`(category_brand_sold_diff, category_brand_sold -
shift(category_brand_sold, 7))]
plot(product9_data[, list(category_brand_sold_diff, res)], col = day(product9_data$event_date))
We can use price_diff as a possible regressor.
product9_data[is.na(price_diff) == T, `:=`(price_diff, 0)]
price_diff_mat_p9 <- matrix(product9_data$price_diff)
arimax_product9 = auto.arima(product9_data$sold_count, xreg = price_diff_mat_p9,
seasonal = T, trace = T, stepwise = F, approximation = F)
##
## ARIMA(0,1,0) : 3860.38
## Regression with ARIMA(0,1,0) errors : 3862.407
## ARIMA(0,1,1) : 3856.398
## Regression with ARIMA(0,1,1) errors : 3858.433
## ARIMA(0,1,2) : 3841.621
## Regression with ARIMA(0,1,2) errors : 3843.648
## ARIMA(0,1,3) : 3837.92
## Regression with ARIMA(0,1,3) errors : 3839.925
## ARIMA(0,1,4) : 3835.305
## Regression with ARIMA(0,1,4) errors : 3837.227
## ARIMA(0,1,5) : 3833.391
## Regression with ARIMA(0,1,5) errors : 3834.468
## ARIMA(1,1,0) : 3858.434
## Regression with ARIMA(1,1,0) errors : 3860.471
## ARIMA(1,1,1) : 3828.174
## Regression with ARIMA(1,1,1) errors : 3827.908
## ARIMA(1,1,2) : 3829.703
## Regression with ARIMA(1,1,2) errors : 3829.588
## ARIMA(1,1,3) : 3831.471
## Regression with ARIMA(1,1,3) errors : 3831.228
## ARIMA(1,1,4) : 3833.551
## Regression with ARIMA(1,1,4) errors : 3833.253
## ARIMA(2,1,0) : 3851.833
## Regression with ARIMA(2,1,0) errors : 3853.879
## ARIMA(2,1,1) : 3829.744
## Regression with ARIMA(2,1,1) errors : 3829.624
## ARIMA(2,1,2) : 3832.095
## Regression with ARIMA(2,1,2) errors : 3831.843
## ARIMA(2,1,3) : 3833.378
## Regression with ARIMA(2,1,3) errors : 3833.317
## ARIMA(3,1,0) : 3850.398
## Regression with ARIMA(3,1,0) errors : 3852.453
## ARIMA(3,1,1) : 3831.545
## Regression with ARIMA(3,1,1) errors : 3831.324
## ARIMA(3,1,2) : 3833.47
## Regression with ARIMA(3,1,2) errors : 3833.396
## ARIMA(4,1,0) : 3843.654
## Regression with ARIMA(4,1,0) errors : 3845.709
## ARIMA(4,1,1) : 3836.206
## Regression with ARIMA(4,1,1) errors : 3838.235
## ARIMA(5,1,0) : 3836.589
## Regression with ARIMA(5,1,0) errors : 3838.636
##
##
##
## Best model: Regression with ARIMA(1,1,1) errors
#Forecast
To compare arima and arimax models, we compare them for test period which is from 2021-05-11 to 2021-05-29.
train_start = as.Date("2020-05-25")
test_start = as.Date("2021-05-11")
test_end = as.Date("2021-05-29")
train_date = seq(train_start, test_start, by = "day")
str(train_date)
## Date[1:352], format: "2020-05-25" "2020-05-26" "2020-05-27" "2020-05-28" "2020-05-29" ...
test_dates = seq(test_start, test_end, by = "day")
test_dates
## [1] "2021-05-11" "2021-05-12" "2021-05-13" "2021-05-14" "2021-05-15"
## [6] "2021-05-16" "2021-05-17" "2021-05-18" "2021-05-19" "2021-05-20"
## [11] "2021-05-21" "2021-05-22" "2021-05-23" "2021-05-24" "2021-05-25"
## [16] "2021-05-26" "2021-05-27" "2021-05-28" "2021-05-29"
##Product 1
We define our functions for arima and arimax models.
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(0, 0, 1), seasonal = c(0,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product1
# forecasted=forecast(fitted,xreg=price_diff_mat_p1[239],
# h=nahead)
forecasted = forecast(fitted, xreg = tail(price_diff_mat_p1[!is.na(price_diff_mat_p1)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product1_data[event_date <= current_date]
forecast_data = product1_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 3 arima_prediction 5.404704e-01
## 2: 2021-05-12 5 arima_prediction 5.479946e-01
## 3: 2021-05-13 1 arima_prediction 5.559558e-01
## 4: 2021-05-14 1 arima_prediction 5.531254e-01
## 5: 2021-05-15 1 arima_prediction 5.546790e-01
## 6: 2021-05-16 1 arima_prediction 5.548164e-01
## 7: 2021-05-17 3 arima_prediction 5.552911e-01
## 8: 2021-05-18 2 arima_prediction 5.616536e-01
## 9: 2021-05-19 1 arima_prediction 5.621574e-01
## 10: 2021-05-20 0 arima_prediction 5.620824e-01
## 11: 2021-05-21 1 arima_prediction 5.598573e-01
## 12: 2021-05-22 0 arima_prediction 5.612916e-01
## 13: 2021-05-23 0 arima_prediction 5.587257e-01
## 14: 2021-05-24 3 arima_prediction 5.574481e-01
## 15: 2021-05-25 2 arima_prediction 5.645349e-01
## 16: 2021-05-26 4 arima_prediction 5.647959e-01
## 17: 2021-05-27 1 arima_prediction 5.728619e-01
## 18: 2021-05-28 1 arima_prediction 5.706718e-01
## 19: 2021-05-29 3 arima_prediction 5.719171e-01
## 20: 2021-05-11 3 arimax_prediction 1.538306e-06
## 21: 2021-05-12 5 arimax_prediction 1.538306e-06
## 22: 2021-05-13 1 arimax_prediction 1.538306e-06
## 23: 2021-05-14 1 arimax_prediction 1.538306e-06
## 24: 2021-05-15 1 arimax_prediction 1.538306e-06
## 25: 2021-05-16 1 arimax_prediction 1.538306e-06
## 26: 2021-05-17 3 arimax_prediction 1.538306e-06
## 27: 2021-05-18 2 arimax_prediction 1.538306e-06
## 28: 2021-05-19 1 arimax_prediction 1.538306e-06
## 29: 2021-05-20 0 arimax_prediction 1.538306e-06
## 30: 2021-05-21 1 arimax_prediction 1.538306e-06
## 31: 2021-05-22 0 arimax_prediction 1.538306e-06
## 32: 2021-05-23 0 arimax_prediction 1.538306e-06
## 33: 2021-05-24 3 arimax_prediction 1.538306e-06
## 34: 2021-05-25 2 arimax_prediction 1.538306e-06
## 35: 2021-05-26 4 arimax_prediction 1.538306e-06
## 36: 2021-05-27 1 arimax_prediction 1.538306e-06
## 37: 2021-05-28 1 arimax_prediction 1.538306e-06
## 38: 2021-05-29 3 arimax_prediction 1.538306e-06
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE RMSE
## 1: arima_prediction 19 1.736842 1.407997 0.8106651 0.6778688 Inf 1.808265
## 2: arimax_prediction 19 1.736842 1.407997 0.8106651 0.9999991 Inf 2.212404
## MAD MADP WMAPE
## 1: 1.354414 0.7798142 0.7798142
## 2: 1.736841 0.9999994 0.9999994
View of data shows that arimax prediction is better in terms of WMAPE.
##Product 2
We define our functions for arima and arimax models.
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(2, 0, 0), seasonal = c(2,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product2
forecasted = forecast(fitted, xreg = tail(visit_count_diff_mat_p2[!is.na(visit_count_diff_mat_p2)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product2_data[event_date <= current_date]
forecast_data = product2_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 112 arima_prediction 3.976428e-01
## 2: 2021-05-12 202 arima_prediction 4.212251e-01
## 3: 2021-05-13 142 arima_prediction 3.379453e-01
## 4: 2021-05-14 170 arima_prediction 4.359351e-01
## 5: 2021-05-15 242 arima_prediction 4.414281e-01
## 6: 2021-05-16 173 arima_prediction 1.037243e+00
## 7: 2021-05-17 107 arima_prediction 1.324535e-01
## 8: 2021-05-18 133 arima_prediction 5.940787e-01
## 9: 2021-05-19 109 arima_prediction 6.176498e-01
## 10: 2021-05-20 102 arima_prediction 6.170576e-01
## 11: 2021-05-21 101 arima_prediction 6.027053e-01
## 12: 2021-05-22 186 arima_prediction 1.382863e+00
## 13: 2021-05-23 231 arima_prediction 7.417304e-01
## 14: 2021-05-24 181 arima_prediction 6.425408e-01
## 15: 2021-05-25 177 arima_prediction 6.438554e-01
## 16: 2021-05-26 172 arima_prediction 3.742800e-01
## 17: 2021-05-27 214 arima_prediction 8.557031e-01
## 18: 2021-05-28 194 arima_prediction 4.385175e-01
## 19: 2021-05-29 228 arima_prediction 7.804554e-01
## 20: 2021-05-11 112 arimax_prediction -1.940188e-19
## 21: 2021-05-12 202 arimax_prediction -1.940188e-19
## 22: 2021-05-13 142 arimax_prediction -1.940188e-19
## 23: 2021-05-14 170 arimax_prediction -1.940188e-19
## 24: 2021-05-15 242 arimax_prediction -1.940188e-19
## 25: 2021-05-16 173 arimax_prediction -1.940188e-19
## 26: 2021-05-17 107 arimax_prediction -1.940188e-19
## 27: 2021-05-18 133 arimax_prediction -1.940188e-19
## 28: 2021-05-19 109 arimax_prediction -1.940188e-19
## 29: 2021-05-20 102 arimax_prediction -1.940188e-19
## 30: 2021-05-21 101 arimax_prediction -1.940188e-19
## 31: 2021-05-22 186 arimax_prediction -1.940188e-19
## 32: 2021-05-23 231 arimax_prediction -1.940188e-19
## 33: 2021-05-24 181 arimax_prediction -1.940188e-19
## 34: 2021-05-25 177 arimax_prediction -1.940188e-19
## 35: 2021-05-26 172 arimax_prediction -1.940188e-19
## 36: 2021-05-27 214 arimax_prediction -1.940188e-19
## 37: 2021-05-28 194 arimax_prediction -1.940188e-19
## 38: 2021-05-29 228 arimax_prediction -1.940188e-19
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 167.1579 46.35762 0.2773284 0.9963806 0.9962379
## 2: arimax_prediction 19 167.1579 46.35762 0.2773284 1.0000000 1.0000000
## RMSE MAD MADP WMAPE
## 1: 172.5354 166.5529 0.9963806 0.9963806
## 2: 173.1406 167.1579 1.0000000 1.0000000
View of data shows that arima prediction is better in terms of MAPE.
#Product 3
We define our functions for arima and arimax models.
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(2, 0, 0), seasonal = c(2,
0, 2))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product3
forecasted = forecast(fitted, xreg = tail(basket_count_mat_p3[!is.na(basket_count_mat_p3)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product3_data[event_date <= current_date]
forecast_data = product3_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 35 arima_prediction 37.60139
## 2: 2021-05-12 53 arima_prediction 38.79232
## 3: 2021-05-13 31 arima_prediction 38.02713
## 4: 2021-05-14 59 arima_prediction 38.80118
## 5: 2021-05-15 66 arima_prediction 38.62039
## 6: 2021-05-16 46 arima_prediction 39.33641
## 7: 2021-05-17 24 arima_prediction 37.92912
## 8: 2021-05-18 37 arima_prediction 37.77184
## 9: 2021-05-19 40 arima_prediction 37.76956
## 10: 2021-05-20 37 arima_prediction 37.77880
## 11: 2021-05-21 34 arima_prediction 37.69812
## 12: 2021-05-22 41 arima_prediction 37.68953
## 13: 2021-05-23 45 arima_prediction 37.69094
## 14: 2021-05-24 31 arima_prediction 39.04140
## 15: 2021-05-25 40 arima_prediction 37.60958
## 16: 2021-05-26 46 arima_prediction 37.66310
## 17: 2021-05-27 41 arima_prediction 37.63778
## 18: 2021-05-28 39 arima_prediction 39.03608
## 19: 2021-05-29 55 arima_prediction 37.59124
## 20: 2021-05-11 35 arimax_prediction 39.21218
## 21: 2021-05-12 53 arimax_prediction 39.21218
## 22: 2021-05-13 31 arimax_prediction 39.21218
## 23: 2021-05-14 59 arimax_prediction 39.21218
## 24: 2021-05-15 66 arimax_prediction 39.21218
## 25: 2021-05-16 46 arimax_prediction 39.21218
## 26: 2021-05-17 24 arimax_prediction 39.21218
## 27: 2021-05-18 37 arimax_prediction 39.21218
## 28: 2021-05-19 40 arimax_prediction 39.21218
## 29: 2021-05-20 37 arimax_prediction 39.21218
## 30: 2021-05-21 34 arimax_prediction 39.21218
## 31: 2021-05-22 41 arimax_prediction 39.21218
## 32: 2021-05-23 45 arimax_prediction 39.21218
## 33: 2021-05-24 31 arimax_prediction 39.21218
## 34: 2021-05-25 40 arimax_prediction 39.21218
## 35: 2021-05-26 46 arimax_prediction 39.21218
## 36: 2021-05-27 41 arimax_prediction 39.21218
## 37: 2021-05-28 39 arimax_prediction 39.21218
## 38: 2021-05-29 55 arimax_prediction 39.21218
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 42.10526 10.39174 0.2468039 0.09489262 0.1789984
## 2: arimax_prediction 19 42.10526 10.39174 0.2468039 0.06871078 0.1790068
## RMSE MAD MADP WMAPE
## 1: 10.74232 7.877993 0.1871023 0.1871023
## 2: 10.52020 7.703340 0.1829543 0.1829543
View of data shows that arima prediction is better in terms of MAPE.
#Product 4
We define our functions for arima and arimax models.
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(1, 0, 2), seasonal = c(0,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product4
forecasted = forecast(fitted, xreg = tail(category_sold_diff_mat_p4[!is.na(category_sold_diff_mat_p4)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product4_data[event_date <= current_date]
forecast_data = product4_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 363 arima_prediction 939.5411
## 2: 2021-05-12 522 arima_prediction 939.2181
## 3: 2021-05-13 560 arima_prediction 938.9446
## 4: 2021-05-14 512 arima_prediction 938.4145
## 5: 2021-05-15 550 arima_prediction 938.4417
## 6: 2021-05-16 530 arima_prediction 937.9748
## 7: 2021-05-17 293 arima_prediction 937.5866
## 8: 2021-05-18 301 arima_prediction 937.3904
## 9: 2021-05-19 273 arima_prediction 936.6304
## 10: 2021-05-20 255 arima_prediction 936.0824
## 11: 2021-05-21 229 arima_prediction 935.6306
## 12: 2021-05-22 261 arima_prediction 935.0349
## 13: 2021-05-23 284 arima_prediction 934.4551
## 14: 2021-05-24 240 arima_prediction 934.0283
## 15: 2021-05-25 216 arima_prediction 933.6112
## 16: 2021-05-26 236 arima_prediction 932.9394
## 17: 2021-05-27 281 arima_prediction 932.3776
## 18: 2021-05-28 240 arima_prediction 931.9122
## 19: 2021-05-29 316 arima_prediction 931.4979
## 20: 2021-05-11 363 arimax_prediction 411.5221
## 21: 2021-05-12 522 arimax_prediction 411.5221
## 22: 2021-05-13 560 arimax_prediction 411.5221
## 23: 2021-05-14 512 arimax_prediction 411.5221
## 24: 2021-05-15 550 arimax_prediction 411.5221
## 25: 2021-05-16 530 arimax_prediction 411.5221
## 26: 2021-05-17 293 arimax_prediction 411.5221
## 27: 2021-05-18 301 arimax_prediction 411.5221
## 28: 2021-05-19 273 arimax_prediction 411.5221
## 29: 2021-05-20 255 arimax_prediction 411.5221
## 30: 2021-05-21 229 arimax_prediction 411.5221
## 31: 2021-05-22 261 arimax_prediction 411.5221
## 32: 2021-05-23 284 arimax_prediction 411.5221
## 33: 2021-05-24 240 arimax_prediction 411.5221
## 34: 2021-05-25 216 arimax_prediction 411.5221
## 35: 2021-05-26 236 arimax_prediction 411.5221
## 36: 2021-05-27 281 arimax_prediction 411.5221
## 37: 2021-05-28 240 arimax_prediction 411.5221
## 38: 2021-05-29 316 arimax_prediction 411.5221
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 340.1053 124.5498 0.3662096 -1.7517350 2.0559821
## 2: arimax_prediction 19 340.1053 124.5498 0.3662096 -0.2099844 0.4654088
## RMSE MAD MADP WMAPE
## 1: 607.6258 595.7743 1.7517350 1.7517350
## 2: 140.7002 136.2999 0.4007581 0.4007581
View of data shows that arimax prediction is better in terms of MAPE.
#Product 5
We define our functions for arima and arimax models.
product5_arima_model
## Series: product5_dec_daily$random
## ARIMA(1,0,1)(1,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.0229 0.2056 -0.1856 0.2837
## s.e. 0.0530 0.0491 0.1669 0.1632
##
## sigma^2 estimated as 15049: log likelihood=-16142.25
## AIC=32294.49 AICc=32294.52 BIC=32323.79
arimax_product5
## Series: product5_data$sold_count
## Regression with ARIMA(5,0,0) errors
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 intercept xreg
## 1.6023 -1.2325 0.7710 -0.4495 0.1912 393.3038 0.6316
## s.e. 0.0510 0.0952 0.1075 0.0948 0.0509 26.6493 0.0081
##
## sigma^2 estimated as 3821: log likelihood=-2059.88
## AIC=4135.75 AICc=4136.15 BIC=4167.1
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(1, 0, 1), seasonal = c(1,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product5
forecasted = forecast(fitted, xreg = tail(residuals_mat_p5[!is.na(residuals_mat_p5)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product5_data[event_date <= current_date]
forecast_data = product5_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 149 arima_prediction 509.9902
## 2: 2021-05-12 389 arima_prediction 506.8503
## 3: 2021-05-13 511 arima_prediction 508.3720
## 4: 2021-05-14 610 arima_prediction 507.9804
## 5: 2021-05-15 473 arima_prediction 507.9171
## 6: 2021-05-16 496 arima_prediction 507.7354
## 7: 2021-05-17 440 arima_prediction 507.9670
## 8: 2021-05-18 371 arima_prediction 507.8527
## 9: 2021-05-19 303 arima_prediction 507.4218
## 10: 2021-05-20 396 arima_prediction 507.2979
## 11: 2021-05-21 178 arima_prediction 507.4749
## 12: 2021-05-22 535 arima_prediction 507.3865
## 13: 2021-05-23 562 arima_prediction 508.6232
## 14: 2021-05-24 384 arima_prediction 508.3583
## 15: 2021-05-25 280 arima_prediction 507.9776
## 16: 2021-05-26 233 arima_prediction 507.1355
## 17: 2021-05-27 268 arima_prediction 506.7011
## 18: 2021-05-28 258 arima_prediction 506.6841
## 19: 2021-05-29 286 arima_prediction 506.7034
## 20: 2021-05-11 149 arimax_prediction 437.6150
## 21: 2021-05-12 389 arimax_prediction 437.6150
## 22: 2021-05-13 511 arimax_prediction 437.6150
## 23: 2021-05-14 610 arimax_prediction 437.6150
## 24: 2021-05-15 473 arimax_prediction 437.6150
## 25: 2021-05-16 496 arimax_prediction 437.6150
## 26: 2021-05-17 440 arimax_prediction 437.6150
## 27: 2021-05-18 371 arimax_prediction 437.6150
## 28: 2021-05-19 303 arimax_prediction 437.6150
## 29: 2021-05-20 396 arimax_prediction 437.6150
## 30: 2021-05-21 178 arimax_prediction 437.6150
## 31: 2021-05-22 535 arimax_prediction 437.6150
## 32: 2021-05-23 562 arimax_prediction 437.6150
## 33: 2021-05-24 384 arimax_prediction 437.6150
## 34: 2021-05-25 280 arimax_prediction 437.6150
## 35: 2021-05-26 233 arimax_prediction 437.6150
## 36: 2021-05-27 268 arimax_prediction 437.6150
## 37: 2021-05-28 258 arimax_prediction 437.6150
## 38: 2021-05-29 286 arimax_prediction 437.6150
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 374.8421 133.5021 0.3561555 -0.3544551 0.6006880
## 2: arimax_prediction 19 374.8421 133.5021 0.3561555 -0.1674650 0.4587804
## RMSE MAD MADP WMAPE
## 1: 185.7820 152.4055 0.4065859 0.4065859
## 2: 144.3094 122.1092 0.3257618 0.3257618
View of data shows that arima prediction is better in terms of MAPE. #Product 6
We define our functions for arima and arimax models.
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(1, 0, 2), seasonal = c(1,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product6
forecasted = forecast(fitted, xreg = tail(basket_count_diff_mat_p6[!is.na(basket_count_diff_mat_p6)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product7_data[event_date <= current_date]
forecast_data = product7_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 126 arima_prediction 49.75387
## 2: 2021-05-12 134 arima_prediction 49.75066
## 3: 2021-05-13 170 arima_prediction 49.82353
## 4: 2021-05-14 178 arima_prediction 49.88569
## 5: 2021-05-15 222 arima_prediction 49.85985
## 6: 2021-05-16 98 arima_prediction 50.05962
## 7: 2021-05-17 72 arima_prediction 50.04828
## 8: 2021-05-18 94 arima_prediction 50.00268
## 9: 2021-05-19 85 arima_prediction 50.03672
## 10: 2021-05-20 77 arima_prediction 50.04390
## 11: 2021-05-21 95 arima_prediction 49.87351
## 12: 2021-05-22 104 arima_prediction 49.92735
## 13: 2021-05-23 114 arima_prediction 49.88775
## 14: 2021-05-24 146 arima_prediction 49.91781
## 15: 2021-05-25 85 arima_prediction 50.08544
## 16: 2021-05-26 125 arima_prediction 50.03131
## 17: 2021-05-27 71 arima_prediction 50.10780
## 18: 2021-05-28 52 arima_prediction 50.11470
## 19: 2021-05-29 67 arima_prediction 50.05460
## 20: 2021-05-11 126 arimax_prediction 40.94258
## 21: 2021-05-12 134 arimax_prediction 40.94258
## 22: 2021-05-13 170 arimax_prediction 40.94258
## 23: 2021-05-14 178 arimax_prediction 40.94258
## 24: 2021-05-15 222 arimax_prediction 40.94258
## 25: 2021-05-16 98 arimax_prediction 40.94258
## 26: 2021-05-17 72 arimax_prediction 40.94258
## 27: 2021-05-18 94 arimax_prediction 40.94258
## 28: 2021-05-19 85 arimax_prediction 40.94258
## 29: 2021-05-20 77 arimax_prediction 40.94258
## 30: 2021-05-21 95 arimax_prediction 40.94258
## 31: 2021-05-22 104 arimax_prediction 40.94258
## 32: 2021-05-23 114 arimax_prediction 40.94258
## 33: 2021-05-24 146 arimax_prediction 40.94258
## 34: 2021-05-25 85 arimax_prediction 40.94258
## 35: 2021-05-26 125 arimax_prediction 40.94258
## 36: 2021-05-27 71 arimax_prediction 40.94258
## 37: 2021-05-28 52 arimax_prediction 40.94258
## 38: 2021-05-29 67 arimax_prediction 40.94258
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 111.3158 43.64128 0.3920493 0.5511749 0.4878333
## 2: arimax_prediction 19 111.3158 43.64128 0.3920493 0.6321944 0.5805426
## RMSE MAD MADP WMAPE
## 1: 74.66883 61.35447 0.5511749 0.5511749
## 2: 82.19921 70.37321 0.6321944 0.6321944
View of data shows that arima prediction is better in terms of MAPE.
#Product 7
We define our functions for arima and arimax models.
product7_arima_model
## Series: product7_dec_daily$random
## ARIMA(1,0,2)(2,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 ma2 sar1 sar2 sma1
## -0.5086 0.8848 0.3308 -0.4717 -0.0636 0.5021
## s.e. 0.1020 0.0937 0.0239 0.1614 0.0238 0.1614
##
## sigma^2 estimated as 1046: log likelihood=-12685.84
## AIC=25385.69 AICc=25385.73 BIC=25426.71
arimax_product7
## Series: product7_data$sold_count
## Regression with ARIMA(3,0,2) errors
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 intercept xreg
## 0.9463 -0.7317 0.3664 -0.1640 0.7235 53.9890 0.0140
## s.e. 0.1536 0.2444 0.1359 0.1338 0.1285 8.3913 0.0028
##
## sigma^2 estimated as 1515: log likelihood=-1886.99
## AIC=3789.98 AICc=3790.37 BIC=3821.33
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(1, 0, 2), seasonal = c(2,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product7
forecasted = forecast(fitted, xreg = tail(visit_count_mat_p7[!is.na(visit_count_mat_p7)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product7_data[event_date <= current_date]
forecast_data = product7_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 126 arima_prediction 50.35676
## 2: 2021-05-12 134 arima_prediction 50.30671
## 3: 2021-05-13 170 arima_prediction 50.37569
## 4: 2021-05-14 178 arima_prediction 50.46844
## 5: 2021-05-15 222 arima_prediction 49.54439
## 6: 2021-05-16 98 arima_prediction 49.68743
## 7: 2021-05-17 72 arima_prediction 49.69112
## 8: 2021-05-18 94 arima_prediction 50.58876
## 9: 2021-05-19 85 arima_prediction 50.57940
## 10: 2021-05-20 77 arima_prediction 49.63221
## 11: 2021-05-21 95 arima_prediction 49.48800
## 12: 2021-05-22 104 arima_prediction 49.57158
## 13: 2021-05-23 114 arima_prediction 49.52115
## 14: 2021-05-24 146 arima_prediction 49.54223
## 15: 2021-05-25 85 arima_prediction 49.67193
## 16: 2021-05-26 125 arima_prediction 49.59822
## 17: 2021-05-27 71 arima_prediction 49.65434
## 18: 2021-05-28 52 arima_prediction 49.68561
## 19: 2021-05-29 67 arima_prediction 49.61904
## 20: 2021-05-11 126 arimax_prediction 45.17516
## 21: 2021-05-12 134 arimax_prediction 45.17516
## 22: 2021-05-13 170 arimax_prediction 45.17516
## 23: 2021-05-14 178 arimax_prediction 45.17516
## 24: 2021-05-15 222 arimax_prediction 45.17516
## 25: 2021-05-16 98 arimax_prediction 45.17516
## 26: 2021-05-17 72 arimax_prediction 45.17516
## 27: 2021-05-18 94 arimax_prediction 45.17516
## 28: 2021-05-19 85 arimax_prediction 45.17516
## 29: 2021-05-20 77 arimax_prediction 45.17516
## 30: 2021-05-21 95 arimax_prediction 45.17516
## 31: 2021-05-22 104 arimax_prediction 45.17516
## 32: 2021-05-23 114 arimax_prediction 45.17516
## 33: 2021-05-24 146 arimax_prediction 45.17516
## 34: 2021-05-25 85 arimax_prediction 45.17516
## 35: 2021-05-26 125 arimax_prediction 45.17516
## 36: 2021-05-27 71 arimax_prediction 45.17516
## 37: 2021-05-28 52 arimax_prediction 45.17516
## 38: 2021-05-29 67 arimax_prediction 45.17516
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE RMSE
## 1: arima_prediction 19 111.3158 43.64128 0.3920493 0.5519702 0.4894221 74.6514
## 2: arimax_prediction 19 111.3158 43.64128 0.3920493 0.5941711 0.5371797 78.6060
## MAD MADP WMAPE
## 1: 61.44300 0.5519702 0.5519702
## 2: 66.14063 0.5941711 0.5941711
View of data shows that arima prediction is better in terms of MAPE.
#Product 8
We define our functions for arima and arimax models.
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(0, 0, 1), seasonal = c(0,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product8
forecasted = forecast(fitted, xreg = tail(category_sold_mat_p8[!is.na(category_sold_mat_p8)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product8_data[event_date <= current_date]
forecast_data = product8_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 204 arima_prediction 273.36571
## 2: 2021-05-12 263 arima_prediction 272.91825
## 3: 2021-05-13 88 arima_prediction 272.91297
## 4: 2021-05-14 128 arima_prediction 272.01190
## 5: 2021-05-15 144 arima_prediction 271.90366
## 6: 2021-05-16 198 arima_prediction 271.44876
## 7: 2021-05-17 940 arima_prediction 271.28317
## 8: 2021-05-18 854 arima_prediction 273.06861
## 9: 2021-05-19 611 arima_prediction 273.34389
## 10: 2021-05-20 768 arima_prediction 273.65636
## 11: 2021-05-21 581 arima_prediction 274.35769
## 12: 2021-05-22 606 arima_prediction 274.39089
## 13: 2021-05-23 727 arima_prediction 275.08719
## 14: 2021-05-24 632 arima_prediction 275.43295
## 15: 2021-05-25 568 arima_prediction 275.75340
## 16: 2021-05-26 589 arima_prediction 276.03369
## 17: 2021-05-27 588 arima_prediction 276.37822
## 18: 2021-05-28 538 arima_prediction 276.66095
## 19: 2021-05-29 528 arima_prediction 276.82738
## 20: 2021-05-11 204 arimax_prediction 37.42197
## 21: 2021-05-12 263 arimax_prediction 37.42197
## 22: 2021-05-13 88 arimax_prediction 37.42197
## 23: 2021-05-14 128 arimax_prediction 37.42197
## 24: 2021-05-15 144 arimax_prediction 37.42197
## 25: 2021-05-16 198 arimax_prediction 37.42197
## 26: 2021-05-17 940 arimax_prediction 37.42197
## 27: 2021-05-18 854 arimax_prediction 37.42197
## 28: 2021-05-19 611 arimax_prediction 37.42197
## 29: 2021-05-20 768 arimax_prediction 37.42197
## 30: 2021-05-21 581 arimax_prediction 37.42197
## 31: 2021-05-22 606 arimax_prediction 37.42197
## 32: 2021-05-23 727 arimax_prediction 37.42197
## 33: 2021-05-24 632 arimax_prediction 37.42197
## 34: 2021-05-25 568 arimax_prediction 37.42197
## 35: 2021-05-26 589 arimax_prediction 37.42197
## 36: 2021-05-27 588 arimax_prediction 37.42197
## 37: 2021-05-28 538 arimax_prediction 37.42197
## 38: 2021-05-29 528 arimax_prediction 37.42197
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 502.8947 256.2574 0.5095648 0.4550669 0.6447023
## 2: arimax_prediction 19 502.8947 256.2574 0.5095648 0.9255869 0.8812548
## RMSE MAD MADP WMAPE
## 1: 338.0950 293.0151 0.5826569 0.5826569
## 2: 528.0877 465.4728 0.9255869 0.9255869
View of data shows that arima prediction is better in terms of MAPE.
#Product 9
We define our functions for arima and arimax models.
product9_arima_model
## Series: product9_dec_daily$random
## ARIMA(1,0,1)(1,0,1)[7] with zero mean
##
## Coefficients:
## ar1 ma1 sar1 sma1
## -0.0651 0.4105 -0.0801 0.2063
## s.e. 0.0385 0.0325 0.1519 0.1494
##
## sigma^2 estimated as 1137: log likelihood=-12795.32
## AIC=25600.64 AICc=25600.66 BIC=25629.94
# forecast with ARIMA models
forecast_with_arima = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arima(input_series, order = c(1, 0, 1), seasonal = c(1,
0, 1))
forecasted = forecast(fitted, h = forecast_ahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# forecast with ARIMAX models
forecast_with_arimax = function(data, forecast_ahead, target_name = "sold_count",
is_seasonal = F, is_stepwise = F, is_trace = T, is_approx = F) {
command_string = sprintf("input_series=data$%s", target_name)
print(command_string)
eval(parse(text = command_string))
fitted = arimax_product9
forecasted = forecast(fitted, xreg = tail(price_diff_mat_p9[!is.na(price_diff_mat_p9)],
1), h = nahead)
return(list(forecast = as.numeric(forecasted$mean), model = fitted))
}
# loop over the test dates
forecast_ahead = 1
results = vector("list", length(test_dates))
i = 1
for (i in 1:length(test_dates)) {
current_date = test_dates[i] - forecast_ahead
print(test_dates[i])
past_data = product8_data[event_date <= current_date]
forecast_data = product8_data[event_date == test_dates[i]]
arima_forecast = forecast_with_arima(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arima_prediction, arima_forecast$forecast)]
arimax_forecast = forecast_with_arimax(past_data, forecast_ahead,
"sold_count", is_trace = F)
forecast_data[, `:=`(arimax_prediction, arimax_forecast$forecast)]
results[[i]] = forecast_data
}
## [1] "2021-05-11"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-12"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-13"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-14"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-15"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-16"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-17"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-18"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-19"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-20"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-21"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-22"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-23"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-24"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-25"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-26"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-27"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-28"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
## [1] "2021-05-29"
## [1] "input_series=data$sold_count"
## [1] "input_series=data$sold_count"
overall_results = rbindlist(results)
melted_result = melt(overall_results, c("event_date", "sold_count"),
c("arima_prediction", "arimax_prediction"))
We turned the results into melted results(long form).
melted_result
## event_date sold_count variable value
## 1: 2021-05-11 204 arima_prediction 220.0599
## 2: 2021-05-12 263 arima_prediction 219.7993
## 3: 2021-05-13 88 arima_prediction 219.8191
## 4: 2021-05-14 128 arima_prediction 219.0358
## 5: 2021-05-15 144 arima_prediction 218.8570
## 6: 2021-05-16 198 arima_prediction 218.6456
## 7: 2021-05-17 940 arima_prediction 218.5281
## 8: 2021-05-18 854 arima_prediction 220.7967
## 9: 2021-05-19 611 arima_prediction 220.8536
## 10: 2021-05-20 768 arima_prediction 220.7399
## 11: 2021-05-21 581 arima_prediction 221.2707
## 12: 2021-05-22 606 arima_prediction 221.1768
## 13: 2021-05-23 727 arima_prediction 221.3741
## 14: 2021-05-24 632 arima_prediction 221.8013
## 15: 2021-05-25 568 arima_prediction 221.8081
## 16: 2021-05-26 589 arima_prediction 221.8768
## 17: 2021-05-27 588 arima_prediction 222.0597
## 18: 2021-05-28 538 arima_prediction 222.2066
## 19: 2021-05-29 528 arima_prediction 222.2686
## 20: 2021-05-11 204 arimax_prediction 63.9130
## 21: 2021-05-12 263 arimax_prediction 63.9130
## 22: 2021-05-13 88 arimax_prediction 63.9130
## 23: 2021-05-14 128 arimax_prediction 63.9130
## 24: 2021-05-15 144 arimax_prediction 63.9130
## 25: 2021-05-16 198 arimax_prediction 63.9130
## 26: 2021-05-17 940 arimax_prediction 63.9130
## 27: 2021-05-18 854 arimax_prediction 63.9130
## 28: 2021-05-19 611 arimax_prediction 63.9130
## 29: 2021-05-20 768 arimax_prediction 63.9130
## 30: 2021-05-21 581 arimax_prediction 63.9130
## 31: 2021-05-22 606 arimax_prediction 63.9130
## 32: 2021-05-23 727 arimax_prediction 63.9130
## 33: 2021-05-24 632 arimax_prediction 63.9130
## 34: 2021-05-25 568 arimax_prediction 63.9130
## 35: 2021-05-26 589 arimax_prediction 63.9130
## 36: 2021-05-27 588 arimax_prediction 63.9130
## 37: 2021-05-28 538 arimax_prediction 63.9130
## 38: 2021-05-29 528 arimax_prediction 63.9130
## event_date sold_count variable value
accu = function(actual, forecast) {
n = length(actual)
error = actual - forecast
mean = mean(actual)
sd = sd(actual)
CV = sd/mean
FBias = sum(error)/sum(actual)
MAPE = sum(abs(error/actual))/n
RMSE = sqrt(sum(error^2)/n)
MAD = sum(abs(error))/n
MADP = sum(abs(error))/sum(abs(actual))
WMAPE = MAD/mean
l = data.frame(n, mean, sd, CV, FBias, MAPE, RMSE, MAD, MADP,
WMAPE)
return(l)
}
To compare models,we accumulate errors of our models in terms of statistical methods with our accumulation function we defined above.
performance = melted_result[, accu(sold_count, value), by = list(variable)]
performance
## variable n mean sd CV FBias MAPE
## 1: arima_prediction 19 502.8947 256.2574 0.5095648 0.5611745 0.6082225
## 2: arimax_prediction 19 502.8947 256.2574 0.5095648 0.8729098 0.7971949
## RMSE MAD MADP WMAPE
## 1: 376.2800 317.4135 0.6311729 0.6311729
## 2: 504.8927 438.9817 0.8729098 0.8729098
View of data shows that arima prediction is better in terms of MAPE.